#install.packages("knitr")
#install.packages("grid")
#install.packages("plotly")
#install.packages("dprep")
#install.packages("normalr")
#install.packages("ggcorrplot")

#install.packages("RColorBrewer")
#install.packages("rgdal")
#install.packages("jsonlite")
#install.packages("readr")
#install.packages("readr")
library(gridExtra)
library(dplyr)
library(lubridate)
library(magrittr)
library(ggplot2)
library(tidyr)
library(knitr)
#library(normalr)
library(ggcorrplot)

library(leaflet)
library(plotly)
library(RColorBrewer)
library(readr)
#library(MLRMPA)
#??src_mysql
my_db <- src_mysql(
  dbname = "coronavirus",
  host = "localhost"
)
my_db
src:  mysql 10.4.17-MariaDB [root@localhost:/coronavirus]
tbls: avg_world_temp_2020, covid19_confirmed, covid19_deaths, covid19_recovered, covid19_thailand,
  covidus, data_gender, data_lockdown, data_population, gdp_us, gdp19, healthranking, population,
  pornhub, sars_2003_update, time_series_covid19_confirmed_global, time_series_covid19_deaths_global,
  time_series_covid19_recovered_global, us_homeless
##import data
df_conf <- tbl(my_db, sql("select * from time_series_covid19_confirmed_global"))
df_conf <- as.data.frame(df_conf)
df_conf
df_deaths <- tbl(my_db, sql("select * from time_series_covid19_deaths_global"))
df_deaths <- as.data.frame(df_deaths)
df_deaths
df_recover <- tbl(my_db, sql("select * from time_series_covid19_recovered_global"))
df_recover <- as.data.frame(df_recover)
df_recover
##check the time frame of the data
n.col <- ncol(df_conf)
dates <- names(df_conf)[5:n.col]%>% mdy()
range(dates)
[1] "2020-01-22" "2021-01-22"
min.date <- min(dates)
max.date <- max(dates)
min.date.txt <- min.date %>% format('%d %b %Y')
max.date.txt <- max.date %>% format('%d %b %Y')
#clean data
cleanData <- function(data) {
  ## remove some columns
  data %<>% select(-c(Province.State, Lat, Long)) %>% rename(country=Country.Region)
  ## convert from wide to long format
  data %<>% gather(key=date, value=count, -country)
  ## convert from character to date
  data %<>% mutate(date = date %>% mdy())
  ## aggregate by country
  data %<>% group_by(country, date) %>% summarise(count=sum(count, na.rm=T)) %>% as.data.frame()
  return(data)
}
## clean the three data sets
data.confirmed <- df_conf %>% cleanData() %>% rename(confirmed=count)
`summarise()` regrouping output by 'country' (override with `.groups` argument)
data.deaths <- df_deaths %>% cleanData() %>% rename(deaths=count)
`summarise()` regrouping output by 'country' (override with `.groups` argument)
data.recovered <- df_recover %>% cleanData() %>% rename(recovered=count)
`summarise()` regrouping output by 'country' (override with `.groups` argument)
data <- data.confirmed %>% merge(data.deaths, all=T) %>% merge(data.recovered, all=T)
data
## countries/regions with confirmed cases, excl. cruise ships
countries <- data %>% pull(country) %>% setdiff('Cruise Ship')
data 
data.world <- data %>% group_by(date) %>%
  summarise(country='World',
            confirmed = sum(confirmed, na.rm=T),
            deaths = sum(deaths, na.rm=T),
            recovered = sum(recovered, na.rm=T))
`summarise()` ungrouping output (override with `.groups` argument)
data %<>% rbind(data.world)
data
data %<>% mutate(current.confirmed = confirmed - deaths - recovered)
data
NA
#rate
data %<>% arrange(country, date)
n <- nrow(data)
day1 <- min(data$date)
data %<>% mutate(new.confirmed = ifelse(date == day1, NA, confirmed - lag(confirmed, n=1)),
                 new.deaths = ifelse(date == day1, NA, deaths - lag(deaths, n=1)),
                 new.recovered = ifelse(date == day1, NA, recovered - lag(recovered, n=1)))
data %<>% mutate(new.confirmed = ifelse(new.confirmed < 0, 0, new.confirmed),
                 new.deaths = ifelse(new.deaths < 0, 0, new.deaths),
                 new.recovered = ifelse(new.recovered < 0, 0, new.recovered))
## death rate based on total deaths and recovered cases
data %<>% mutate(rate.upper = (100 * deaths / (deaths + recovered)) %>% round(1))
## lower bound: death rate based on total confirmed cases
data %<>% mutate(rate.lower = (100 * deaths / confirmed) %>% round(1))
## death rate based on the number of death/recovered on every single day
data %<>% mutate(rate.daily = (100 * new.deaths / (new.deaths + new.recovered)) %>% round(1))
View(data)
## convert from wide to long format
data.long <- data %>%
  select(c(country, date, confirmed, current.confirmed, recovered, deaths)) %>%
  gather(key=type, value=count, -c(country, date))
## set factor levels to show them in a desirable order
data.long %<>% mutate(type=recode_factor(type, confirmed='Total Confirmed',
                                         current.confirmed='Current Confirmed',
                                         recovered='Recovered',
                                         deaths='Deaths'))
View(data.long)
##Number of case World
world <- filter(data.long,country == 'World')
plot1 <- world %>% filter(type != 'Total Confirmed') %>%
  ggplot(aes(x=date, y=count)) +
  geom_area(aes(fill=type), alpha=0.5) +
  labs(title=paste0('Numbers of Cases Worldwide - ', max.date.txt)) +
  scale_fill_manual(values=c('red', 'green', 'black')) +
  theme(legend.title=element_blank(), legend.position='bottom',
        plot.title = element_text(size=7),
        axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        legend.key.size=unit(0.2, 'cm'),
        legend.text=element_text(size=6),
        axis.text=element_text(size=7),
        axis.text.x=element_text(angle=45, hjust=1))
plot2 <- world %>%
  ggplot(aes(x=date, y=count)) +
  geom_line(aes(color=type)) +
  labs(title=paste0('Numbers of Cases Worldwide (log scale) - ', max.date.txt)) +
  scale_color_manual(values=c('purple', 'red', 'green', 'black')) +
  theme(legend.title=element_blank(), legend.position='bottom',
        plot.title = element_text(size=14),
        axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        legend.key.size=unit(0.2, 'cm'),
        legend.text=element_text(size=14),
        axis.text=element_text(size=14),
        axis.text.x=element_text(angle=45, hjust=1)) +
  scale_y_continuous(trans='log10')
## show two plots side by side
grid.arrange(plot1, plot2, ncol=2)

## Current Confirmed Cases
data.world <- data %>% filter(country=='World')
n <- nrow(data.world)
plot1 <- ggplot(data.world, aes(x=date, y=current.confirmed)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='Current Confirmed Cases') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot2 <- ggplot(data.world, aes(x=date, y=new.confirmed)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='Daily New Confirmed Cases') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
## show two plots side by side
grid.arrange(plot1, plot2, ncol=2)

View(data.world)
## a scatter plot with a smoothed line and vertical x-axis labels
plot1 <- ggplot(data.world, aes(x=date, y=deaths)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='Accumulative Deaths') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot2 <- ggplot(data.world, aes(x=date, y=recovered)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='Accumulative Recovered Cases') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot3 <- ggplot(data.world, aes(x=date, y=new.deaths)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='New Deaths') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot4 <- ggplot(data.world, aes(x=date, y=new.recovered)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='New Recovered Cases') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
## show four plots together, with 2 plots in each row
grid.arrange(plot1, plot2, plot3, plot4, nrow=2)

## convert from wide to long format, for drawing area plots
rates.long <- data %>%
  select(c(country, date, rate.upper, rate.lower, rate.daily)) %>%
  gather(key=type, value=count, -c(country, date))
# set factor levels to show them in a desirable order
rates.long %<>% mutate(type=recode_factor(type, rate.daily='Daily',
                             rate.upper='Upper bound'))
## ranking by confirmed cases
data.latest.all <- data %>% filter(date == max(date)) %>%
  select(country, date,confirmed, new.confirmed, current.confirmed,
         recovered, deaths, new.deaths, death.rate=rate.lower) %>%
  mutate(ranking = dense_rank(desc(confirmed)))
#View(data.latest.all)
k <- 20
## top 20 countries: 21 incl. 'World'
top.countries <- data.latest.all %>% filter(ranking <= k + 1) %>%
  arrange(ranking) %>% pull(country) %>% as.character()
top.countries %>% setdiff('World') %>% print()
 [1] "US"             "India"          "Brazil"         "Russia"         "United Kingdom"
 [6] "France"         "Spain"          "Italy"          "Turkey"         "Germany"       
[11] "Colombia"       "Argentina"      "Mexico"         "Poland"         "South Africa"  
[16] "Iran"           "Ukraine"        "Peru"           "Indonesia"      "Netherlands"   
data.latest <- data.latest.all %>% filter(!is.na(country)) %>%
  mutate(country=ifelse(ranking <= k + 1, as.character(country), 'Others')) %>%
  mutate(country=country %>% factor(levels=c(top.countries, 'Others')))
data.latest %<>% group_by(country) %>%
  summarise(confirmed=sum(confirmed), new.confirmed=sum(new.confirmed),
            current.confirmed=sum(current.confirmed),
            recovered=sum(recovered), deaths=sum(deaths), new.deaths=sum(new.deaths)) %>%
  mutate(death.rate=(100 * deaths/confirmed) %>% round(1)) 
`summarise()` ungrouping output (override with `.groups` argument)
data.latest
data.latest %<>% select(c(country, confirmed, deaths, death.rate,
                          new.confirmed, new.deaths, current.confirmed,recovered)) %>%
  mutate(recover.rate=(100 * recovered/confirmed) %>% round(1))
data.latest
df_pop <- tbl(my_db, sql("select * from population "))
df_pop <- as.data.frame(df_pop)
df_pop <- rename(df_pop,"country"="Country")
df_pop
data.latest <- merge(x = data.latest, y = df_pop, by = "country", all.x = TRUE) 
data.latest <- rename(data.latest,"population" = "Population (2020)")
data.latest
data.latest  %<>% select(c(country, confirmed, deaths, death.rate,
                          new.confirmed, new.deaths,
                          current.confirmed,recovered,recover.rate,population)) %>%
  mutate(confirm.rate=(100 *confirmed/population) %>% round(1))
data.latest
NA
data.latest %>% mutate(death.rate=death.rate %>% format(nsmall=1) %>% paste0('%'))
NA
NA
## convert from wide to long format, for drawing area plots
data.latest.long <- data.latest %>% filter(country!='World') %>%
  gather(key=type, value=count, -country)
## set factor levels to show them with proper text and in a desirable order
data.latest.long %<>% mutate(type=recode_factor(type,
                                                confirmed='Total Confirmed',
                                                deaths='Total Deaths',
                                                death.rate='Death Rate (%)',
                                                new.confirmed='New Confirmed (compared with one day before)',
                                                new.deaths='New Deaths (compared with one day before)',
                                                current.confirmed='Current Confirmed',
                                                recover.rate = 'Recover Rate(%)',
                                                confirm.rate = 'Confirmed Rate(%)'))
#View(data.latest.long)
data.one.dem <- filter(data.latest.long,type=='Total Confirmed'
                       | type=='Total Deaths'
                       | type=='Current Confirmed')
data.two.dem <- filter(data.latest.long,type=='Confirmed Rate(%)'
                       #| type=='New Confirmed (compared with one day before)'
                       #| type=='New Deaths (compared with one day before)'
                       | type=='Death Rate (%)'
                       | type=='Recover Rate(%)')
data.two.dem
## bar chart
data.one.dem %>% ggplot(aes(x=country, y=count, fill=country, group=country)) +
  geom_bar(stat='identity') +
  geom_text(aes(label=count, y=count), size=2, vjust=0) +
  xlab('') + ylab('') +
  labs(title=paste0('Top 20 Countries with Most Confirmed Cases - ', max.date.txt)) +
  scale_fill_discrete(name='Country', labels=aes(count)) +
  theme(legend.title=element_blank(),
        legend.position='none',
        plot.title=element_text(size=11),
        axis.text=element_text(size=7),
        axis.text.x=element_text(angle=45, hjust=1)) +
  facet_wrap(~type, ncol=1, scales='free_y')

data.two.dem$facet <- factor(data.two.dem$type, levels = c('Confirmed Rate(%)', 'Recover Rate(%)','Death Rate (%)'))
data.two.dem %>% 
  ggplot(aes(x=country, y=count, fill=country, group=country)) +
  geom_bar(stat='identity') +
  geom_text(aes(label=count, y=count), size=2, vjust=0) +
  xlab('') + ylab('') +
  labs(title=paste0('Top 20 Countries with Most Confirmed Cases - ', max.date.txt)) +
  scale_fill_discrete(name='Country', labels=aes(count)) +
  theme(legend.title=element_blank(),
        legend.position='none',
        plot.title=element_text(size=11),
        axis.text=element_text(size=9),
        axis.text.x=element_text(size=6,angle=45, hjust=1)) +
  facet_wrap(~facet, ncol=1, scales='free_y')

#df_gdp
df_gdp2019 <- tbl(my_db, sql("select * from gdp19"))
df_gdp2019 <- as.data.frame(df_gdp2019)
df_gdp2019
NA
#healthranking
df_healt <- tbl(my_db, sql("select * from healthranking"))
df_healt <- as.data.frame(df_healt)
df_healt <- select(df_healt,c("country","healthCareIndex"))
df_healt
#Top20Pornhub
df_pornhub <- tbl(my_db, sql("select * from Pornhub"))
df_pornhub <- as.data.frame(df_pornhub)
df_pornhub
NA
#temp
df_temp <- tbl(my_db, sql("select * from Avg_World_Temp_2020"))
df_temp <- as.data.frame(df_temp)
df_city <- select(df_temp,c("Country","City")) %>%
  rename(country=Country) %>% 
  rename(city=City)
numofcity <- aggregate(city ~ country, data = df_city, length)
df_temp <- select(df_temp,c("Country","Apr","May","Jun","Jul","Aug")) %>%
  rename(country=Country)
df_temp <- data.frame(country=df_temp[,1],avg=rowMeans(df_temp[,-1]))
df_temp <- df_temp %<>% group_by(country) %>% summarise(avg_temp = mean(avg,na.rm = TRUE))
`summarise()` ungrouping output (override with `.groups` argument)
df_temp <- df_temp %>% mutate(country=ifelse(country=="United States","US", country ) ) 
df_temp$avg_temp <- df_temp$avg_temp %>% 
  sprintf(df_temp$avg_temp, fmt = '%#.1f') %>%
  as.numeric(df_temp$avg_temp)
df_temp
#Top 20 with gdp
data.longGDP <- df_gdp %>% gather(key=year, value=GDP, -c(country))
data.top <- data.latest %>% filter(country!='World')
data.top <- head(data.top,20)
View(data.latest)
#merge
mergcountry = function(data1,data2){
  data <- merge(x = data1, y = data2, by = "country", all.x = TRUE) 
  return(data)
}
data.top.world <- merge(x = data.top, y = df_gdp2019, by = "country", all.x = TRUE) %>% 
  select(-c(code,rank,new.confirmed,new.deaths,current.confirmed,population)) %>% 
  rename(GDP="GDP (millions of US dollars)")

data.top.world <- merge(x = data.top.world, y = df_healt, by = "country", all.x = TRUE) %>%
  rename(healthcare="healthCareIndex")
#data.top.world <- mergcountry(data.top.world, df_temp)

data.top.world <- merge(x = data.top.world, y = df_pornhub, by = "country", all.x = TRUE) %>%
  rename(Pornhub = "PornhubIndex(%)")

data.top.world <- mergcountry(data.top.world, df_temp)
index <- is.na(data.top.world)
data.top.world[index] <- 0
data.top.world
View(data.top.world)

normalize = function(data){
  #return ((data - min(data,na.rm = TRUE))/(max(data,na.rm = TRUE) - min(data,na.rm = TRUE)))
  z <- scale(data);
  tanh(z/2)
}
norm_data = as.data.frame(apply(data.top.world[,2:12],2,normalize))
corr_data <- norm_data
norm_data$country <- c("Argentina","Bangladesh","Brazil","Chile","Colombia","France","Germany","India","Iran","Italy","Mexico","Pakistan","Peru","Russia","saudi Arabia","South Africa","Spain","Turkey","United Kingdom","US")
#View(norm_data)


norm_data_plot <- select(norm_data,"country","confirm.rate","death.rate","recover.rate","healthcare","Pornhub","GDP","avg_temp")
norm_data_plot %<>% gather(key=type, value=count, -c(country))
level_order <- factor(norm_data_plot$type, 
                      level = c("GDP","avg_temp","healthcare","recover.rate","death.rate","confirm.rate","Pornhub"))
ggplot(data = norm_data_plot, aes(x=country, y=level_order, fill=count)) + 
  geom_tile() +
  scale_fill_gradient(low = "pink", high = "blue") +
  xlab("") +
  ylab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90,vjust = 1))+
  theme(
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    #legend.position = "none"
  )

NA
NA
#correlation
corr_data %<>% select(c(GDP,confirm.rate,death.rate,recover.rate,healthcare,avg_temp,Pornhub))
head(corr_data)
cor(corr_data)
                    GDP confirm.rate  death.rate recover.rate  healthcare    avg_temp     Pornhub
GDP           1.0000000    0.4342894 -0.23113988   -0.3937431  0.25183938  0.15986445  0.69972226
confirm.rate  0.4342894    1.0000000 -0.34422360   -0.7182585  0.48153104 -0.36761397  0.44060606
death.rate   -0.2311399   -0.3442236  1.00000000    0.1733012 -0.17258588  0.26319481 -0.06882314
recover.rate -0.3937431   -0.7182585  0.17330123    1.0000000 -0.76240251  0.12250932 -0.40581644
healthcare    0.2518394    0.4815310 -0.17258588   -0.7624025  1.00000000 -0.05087304  0.35166155
avg_temp      0.1598645   -0.3676140  0.26319481    0.1225093 -0.05087304  1.00000000  0.10674647
Pornhub       0.6997223    0.4406061 -0.06882314   -0.4058164  0.35166155  0.10674647  1.00000000
ggcorrplot(cor(corr_data),hc.order = TRUE,
           outline.color = "white",
           colors = c("#6D9EC1","white","#E46726"),
           lab = TRUE)

df <- data.long %>% filter(country %in% top.countries) %<>%
  mutate(country=country %>% factor(levels=c(top.countries)))
Error in data.long %>% filter(country %in% top.countries) <- list(country = c(13L,  : 
  could not find function "%>%<-"
p <- df %>% filter(country != 'World') %>%
  ggplot(aes(x=date, y=count, color=type)) +
  geom_line() +
  labs(title=paste0('Numbers of COVID-19 Cases in Top 20 Countries (log scale) - ',
                    max.date.txt)) +
  scale_color_manual(values=c('purple', 'red', 'green', 'black')) +
  theme(legend.title=element_blank(), legend.position='bottom',
        plot.title = element_text(size=10),
        axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        legend.key.size=unit(0.4, 'cm'),
        legend.text=element_text(size=10),
        strip.text.x=element_text(size=10),
        axis.text=element_text(size=10),
        axis.text.x=element_text(angle=45, hjust=1)) +
  scale_y_continuous(trans='log10')
p + facet_wrap(~country, ncol=4, scales='free_y')
data.world %<>% arrange(desc(date)) %>%
  select(c(date, confirmed, deaths, recovered, current.confirmed,new.confirmed, new.deaths, new.recovered, rate.lower, rate.upper, rate.daily))
data.world %>%
  mutate(rate.upper = rate.upper %>% format(nsmall=1) %>% paste0('\\%'),
         rate.lower = rate.lower %>% format(nsmall=1) %>% paste0('\\%'),
         rate.daily = rate.daily %>% format(nsmall=1) %>% paste0('\\%')) 
#sars_2003
df_sars <- tbl(my_db, sql("select * from sars_2003_update"))
df_sars <- as.data.frame(df_sars)
df_sars
## convert from character to date
#datesSar <- as.Date(df_sars$Date,format = "%m/%d/%y")

df_sars %<>%  mutate(Date = as.Date(df_sars$Date,format = "%m/%d/%y"))
df_sars
## convert from wide to long format
dataSar.long <- df_sars %>%
  select(c(Date, country, Cumulative_number , Number_deaths, Number_recovered)) %>%
  gather(key=type, value=count, -c(country, Date))
## set factor levels to show them in a desirable order
dataSar.long %<>% mutate(type=recode_factor(type, Cumulative_number ='Cumulative Number',
                                         Number_deaths ='Number of deaths',
                                         Number_recovered ='Number of recovered'))
View(dataSar.long)
 
#Covid_Thailand
df_thai <- tbl(my_db, sql("select * from covid19_Thailand"))
df_thai <- as.data.frame(df_thai)
View(df_thai)
#clean Covid_Thailand
dates.th <- df_thai[,2]%>% mdy()
range(dates.th)
[1] "2020-01-12" "2021-01-15"
min.date.th <- min(dates.th)
max.date.th <- max(dates.th)
min.date.txt.th <- min.date.th %>% format('%d %b %Y')
max.date.txt.th <- max.date.th %>% format('%d %b %Y')
df_thai$announce_date <- mdy(df_thai$announce_date)
df_thai$notification_date <- mdy(df_thai$notification_date)
df_thai 
df_thai <- df_thai %>% select(!No.) %>% select(!notification_date) %>% 
  group_by(announce_date)
Error: Can't subset columns that don't exist.
x Column `No.` doesn't exist.
# Total confirmed cases in Thailand
data.thai.count <- df_thai %>%
  select(announce_date) %>%
  summarise(comfirmed = n())  %>% as.data.frame()
`summarise()` ungrouping output (override with `.groups` argument)
data.thai.count$cumulative_confirmed <- cumsum(data.thai.count[, 2])
data.thai.count
## Thai Confirmed Cases (Jan 2020 - Jan 2021
plot1 <- ggplot(data.thai.count, aes(x=announce_date, y=cumulative_confirmed)) +
  geom_point() + geom_smooth() +
  xlab(" ") + ylab("Count") + labs(title='Thai Cumulative Confirmed Cases (Jan 2020 - Jan 2021)') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot2 <- ggplot(data.thai.count, aes(x=announce_date, y=comfirmed)) +
  geom_point() + geom_smooth() +
    xlab(" ") + ylab("Count")+ labs(title='Thai Confirmed Cases (Jan 2020 - Jan 2021 log scale)') +
   theme(axis.text.x=element_text(angle=45, hjust=1))+scale_y_continuous(trans='log10')
## show two plots side by side
grid.arrange(plot1, plot2, ncol=1)

## Thai Confirmed Cases (Jan 2020 - Jan 2021) log scale
plot1 <- ggplot(data.thai.count, aes(x=announce_date, y=cumulative_confirmed)) +
  geom_point() + geom_smooth() +
  xlab(" ") + ylab("Count") + labs(title='Thai Cumulative Confirmed Cases (Jan 2020 - Jan 2021 log scale)') +
  theme(axis.text.x=element_text(angle=45, hjust=1))+scale_y_continuous(trans='log10')
plot2 <- ggplot(data.thai.count, aes(x=announce_date, y=comfirmed)) +
  geom_point() + geom_smooth() +
  xlab(" ") + ylab("Count")+ labs(title='Thai Confirmed Cases (Jan 2020 - Jan 2021 log scale)') +
  theme(axis.text.x=element_text(angle=45, hjust=1))+scale_y_continuous(trans='log10')
## show two plots side by side
grid.arrange(plot1, plot2, ncol=1)

View(df_thai)
# Confirmed cases divided by sex (gender)
df_thai$sex[df_thai$sex == ""] <- "Unknown"
data.thai.gender <- df_thai %>%
  group_by(sex) %>%
  summarise(count = n()) %>%
  mutate(percent = (count / sum(count) * 100) %>% round(2)) %>%
  filter(percent>1)%>%
  #mutate(pos = cumsum(percent) - 0.5*percent) %>%
  arrange(desc(percent))
`summarise()` ungrouping output (override with `.groups` argument)
data.thai.gender
data.thai.gender$sex <- factor(data.thai.gender$sex, levels = as.character(data.thai.gender$sex))
data.thai.gender$sex
[1] Female  Male    Unknown
Levels: Female Male Unknown
g.th.gender <- data.thai.gender %>% 
  ggplot(aes(x = "", y = percent, fill = sex)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  theme_void() +
  labs(title='Gender of Thai Confirmed Cases (Jan 2020 - Jan 2021)')+
  geom_text(aes(label = paste0(percent, "%")), color = "white", size = 5, position = position_stack(vjust = 0.5)) +
  guides(fill = guide_legend(reverse = TRUE)) 
g.th.gender

# Confirmed cases divided by province_of_onset
data.thai.onset <- df_thai %>%
    group_by(province_of_onset) %>%
    summarise(count = n()) %>%
    arrange(desc(count))%>%
    rename(onset="count")%>%
    rename(province="province_of_onset")
`summarise()` ungrouping output (override with `.groups` argument)
data.thai.onset$province[data.thai.onset$province == ""] <- "Unknown"
data.thai.onset
# Confirmed cases divided by province_of_onset
data.thai.isolation <- df_thai %>%
    group_by(province_of_isolation) %>%
    summarise(count = n()) %>%
    arrange(desc(count))%>%
    rename(isolation ="count")%>%
    rename(province="province_of_isolation")
`summarise()` ungrouping output (override with `.groups` argument)
data.thai.isolation$province[data.thai.isolation$province == ""] <- "Unknown"
data.thai.isolation
data.province <- merge(x = data.thai.onset, y = data.thai.isolation, by = "province", all.x = TRUE) 
data.province <- data.province%>%filter(onset>125)
data.province
data.province.long <- data.province %>%
    gather(key=type, value=count, -c(province))
data.province.long
ggplot(data.province.long,aes(x=province,y=count))+
    geom_bar(stat = "identity",position = "dodge",aes(fill=type))+
    labs(title = "Province of onset and Province of isolation in Thailand")+
    theme(legend.title=element_blank(),
                  #legend.position='none',
                  plot.title=element_text(size=12),
                  axis.text=element_text(size=10),
                  axis.text.x=element_text(angle=45, hjust=1))+ xlab('') + ylab('Count')

# Confirmed cases divided by risk
data.thai.risk <- df_thai %>%
  group_by(risk) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
`summarise()` ungrouping output (override with `.groups` argument)
data.thai.risk
# Confirmed cases divided by risk
data.thai.risk <- df_thai %>%
  group_by(risk) %>%
  summarise(count = n()) %>%
  mutate(percent = (count / sum(count) * 100) %>% round(2)) %>%
  filter(percent>0.76) %>%
  arrange(desc(percent)) 
`summarise()` ungrouping output (override with `.groups` argument)
data.thai.risk
data.thai.risk$risk[data.thai.risk$risk == "C"] <- "Close contact with the patient"
data.thai.risk$risk[data.thai.risk$risk == "CGustDr Samut SaGGAF"] <- "Cluster Samut Sakhon"
data.thai.risk$risk[data.thai.risk$risk == "F"] <- "State Quarantine"
data.thai.risk$risk[data.thai.risk$risk == "G"] <- "Go to a crowded place"
data.thai.risk$risk[data.thai.risk$risk == "A"] <- "People travelling from abroad"
data.thai.risk$risk[data.thai.risk$risk == "UFGFAwF"] <- "Unknown"
data.thai.risk$risk[data.thai.risk$risk == "D"] <- "Career at risk"
data.thai.risk$risk[data.thai.risk$risk == "CGustDr RayAFg"] <- "Cluster Rayong"
data.thai.risk$risk[data.thai.risk$risk == "CGustDr Pattaya CasCFA"] <- "Cluster Pattaya Casino"
data.thai.risk$risk[data.thai.risk$risk == "CGustDr AFg TGAFg"] <- "Cluster Ang Thong"
data.thai.risk
data.thai.risk$risk <- factor(data.thai.risk$risk, levels = as.character(data.thai.risk$risk))
data.thai.risk$risk
[1] Cluster Samut Sakhon           Close contact with the patient
[3] Go to a crowded place          State Quarantine              
[5] People travelling from abroad  Unknown                       
[7] Cluster Rayong                 Career at risk                
[9] Cluster Pattaya Casino        
9 Levels: Cluster Samut Sakhon ... Cluster Pattaya Casino
g.th.risk <- data.thai.risk %>% 
  ggplot(aes(x = "", y = percent, fill = risk)) +
  geom_bar(stat = "identity", width = 0.5) +
  coord_polar("y") +
  theme_void() +
  labs(title='Risk of Thai Confirmed Cases(Jan 2020 - Jan 2021)')+
  geom_text(aes(label = paste0(percent, "%")), color = "Black", size = 3, position = position_stack(vjust = 0.5)) +
  guides(fill = guide_legend(reverse = TRUE)) 
g.th.risk

# Confirmed cases divided by age
data.thai.age <- df_thai %>%
  group_by(age,sex) %>% 
  filter(age != 0)%>%
  summarise(count = n()) %>%
  arrange(desc(count))
`summarise()` regrouping output by 'age' (override with `.groups` argument)
data.thai.age
ggplot(data.thai.age,aes(x=age,y=count,fill=sex))+geom_bar(stat = "identity")+
      labs(title='Age of Thai Confirmed Cases (Stack)')+guides(fill=guide_legend(reverse = T))

# Confirmed cases divided by nationality


data.thai.nationality <- df_thai %>%
  group_by(nationality) %>%
  summarise(count = n()) %>%
  filter(count > 11)%>%
  arrange(desc(count))
`summarise()` ungrouping output (override with `.groups` argument)
data.thai.nationality$nationality[data.thai.nationality$nationality == "????????"] <- "Unknown"
data.thai.nationality$nationality[data.thai.nationality$nationality == ""] <- "Unknown"
data.thai.nationality$nationality[data.thai.nationality$nationality == "Burma"] <- "Myanmar"
data.thai.nationality
ggplot(data.thai.nationality)+ 
  geom_bar(aes(x=nationality,y=count,fill = count),stat = "identity",show.legend = F)+
    ggtitle('Nationality of Thai Confirmed Cases (log scale)')+
    theme(plot.title = element_text(hjust = 0.5))+
    theme(axis.text = element_text(size = 10),
            axis.title = element_text(size = 10),
            plot.title = element_text(size = 10))+
    xlab("")+ylab("Number of Nationality")+
   scale_y_continuous(trans='log10')+coord_flip()+scale_fill_gradient(low="blue",high = "red")

#Covid_US
df_us <- tbl(my_db, sql("select * from covidUs"))
df_us <- as.data.frame(df_us)
df_us
#clean Covid_US
dates.us <- df_us[,2]%>% mdy()
range(dates.us)
[1] "2020-01-21" "2020-12-24"
min.date.us <- min(dates.us)
max.date.us <- max(dates.us)
min.date.txt.us <- min.date.us %>% format('%d %b %Y')
max.date.txt.us <- max.date.us %>% format('%d %b %Y')
df_us$date <- mdy(df_us$date)
df_us
df_us <- df_us %>% select(!MyUnknownColumn) %>% select(!fips) %>% 
  group_by(date)
df_us
data.us.total <- df_us %>% group_by(date) %>%
  summarise(state ='US',
            cases = sum(deaths,na.rm=T),
            deaths = sum(deaths,na.rm=T))
`summarise()` ungrouping output (override with `.groups` argument)
View(data.us.total)
data.us.latest <- df_us %>%filter(date==max.date.us)
View(data.us.latest)
data.us.latest %<>% mutate(ranking = dense_rank(desc(cases))) %>%arrange(ranking) 
top.us <- data.us.latest[,2]
top.us
# List of top 20 state
k <- 19
data.us.top <- data.us.latest %>%
    filter(ranking <= k+1) %>% 
    arrange(ranking) 
View(data.us.top)
us.state.top <- data.us.top %>% pull(state) %>% as.character()
us.state.top  %>% setdiff('US') %>% print()
 [1] "California"     "Texas"          "Florida"        "Illinois"       "New York"      
 [6] "Ohio"           "Georgia"        "Pennsylvania"   "Tennessee"      "Michigan"      
[11] "Wisconsin"      "North Carolina" "Indiana"        "Arizona"        "New Jersey"    
[16] "Minnesota"      "Missouri"       "Massachusetts"  "Alabama"        "Virginia"      
View(us.state.top)
# confirmed rate & death rate of top 20 state
top_us <- data.us.latest %>% filter(state %in% us.state.top & state != "US")
View(top_us)
#gender in us
df_gender_us <- tbl(my_db, sql("select * from data_gender"))
df_gender_us <- as.data.frame(df_gender_us)
df_gender_us <- select(df_gender_us,c("State","Male","Female"))
df_gender_us <- rename(df_gender_us,"state"="State")
df_gender_us
#population in us
df_pop_us <- tbl(my_db, sql("select * from data_population"))
df_pop_us <- as.data.frame(df_pop_us)
df_pop_us <- select(df_pop_us,c("State","Population"))
df_pop_us <- rename(df_pop_us,"state"="State")
df_pop_us
#lockdown in us
df_lockdown_us <- tbl(my_db, sql("select * from data_lockdown"))
df_lockdown_us <- as.data.frame(df_lockdown_us)
df_lockdown_us <- select(df_lockdown_us,c("State","Day lockdown"))
df_lockdown_us <- rename(df_lockdown_us,"state"="State")
df_lockdown_us
#GDP in us
df_gdp_us <- tbl(my_db, sql("select * from gdp_us"))
df_gdp_us <- as.data.frame(df_gdp_us)
df_gdp_us <- select(df_gdp_us,c("State","GDPs"))
df_gdp_us <- rename(df_gdp_us,"state"="State")
df_gdp_us
#homeless in us
df_homeless_us <- tbl(my_db, sql("select * from us_homeless"))
df_homeless_us <- as.data.frame(df_homeless_us)
df_homeless_us <- select(df_homeless_us,c("State","Homeless"))
df_homeless_us <- rename(df_homeless_us,"state"="State")
df_homeless_us
#View(df_us)
#merge
mergcountry = function(data1,data2){
  data <- merge(x = data1, y = data2, by = "state", all.x = TRUE) 
  return(data)
}

df_Allus <- mergcountry(top_us, df_gender_us)
df_Allus <- mergcountry(df_Allus, df_pop_us)
df_Allus <- mergcountry(df_Allus, df_lockdown_us)
df_Allus <- mergcountry(df_Allus, df_gdp_us)
df_Allus <- mergcountry(df_Allus, df_homeless_us)
#df_Allus <- mergcountry(top_us, df_Allus)
View(df_Allus)
index <- is.na(df_Allus)
df_Allus[index] <- 0

normalize = function(data){
  #return ((data - min(data,na.rm = TRUE))/(max(data,na.rm = TRUE) - min(data,na.rm = TRUE)))
  z <- scale(data);
  tanh(z/2)
}
Allus = as.data.frame(apply(df_Allus[,3:11],2,normalize))
corr_dataUS <- Allus 

View(Allus)
Allus$state <- c(df_Allus$state)
Allus <- rename(Allus,"confirmed"="cases")
View(Allus)


Allus_plot <- select(Allus,"state","confirmed","deaths","Male","Female","Population","Day lockdown","GDPs","Homeless")
Allus_plot %<>% gather(key=type, value=count, -c(state))
level_order <- factor(Allus_plot$type, 
                      level = c("confirmed","deaths","Male","Female","Population","Day lockdown","GDPs","Homeless"))
ggplot(data = Allus_plot, aes(x=state, y=level_order, fill=count)) + 
  geom_tile() +
  scale_fill_gradient(low = "pink", high = "blue") +
  xlab("") +
  ylab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90,vjust = 1))+
  theme(
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    #legend.position = "none"
  ) +labs(title='The heatmap of COVID-19 infections in US')

NA
NA
NA
corr_dataUS <- rename(corr_dataUS,"Daylockdown"="Day lockdown")
#correlation
corr_dataUS %<>% select(c(cases,deaths,Male,Female,Population,Daylockdown,GDPs,Homeless))
head(corr_dataUS)
cor(corr_dataUS)
                   cases       deaths         Male       Female   Population Daylockdown        GDPs
cases        1.000000000  0.792654639  0.105367061 -0.005945092  0.959941301  -0.2058259  0.91047827
deaths       0.792654639  1.000000000  0.008041947  0.017335533  0.852691131  -0.3797468  0.88497061
Male         0.105367061  0.008041947  1.000000000 -0.877329594 -0.009961687  -0.2898630 -0.02437981
Female      -0.005945092  0.017335533 -0.877329594  1.000000000  0.069101784   0.3629328  0.02235864
Population   0.959941301  0.852691131 -0.009961687  0.069101784  1.000000000  -0.2155126  0.97052821
Daylockdown -0.205825865 -0.379746786 -0.289863040  0.362932815 -0.215512639   1.0000000 -0.36113782
GDPs         0.910478267  0.884970609 -0.024379811  0.022358644  0.970528214  -0.3611378  1.00000000
Homeless     0.791615898  0.796426563 -0.080155515  0.151488777  0.863178014  -0.3573560  0.89961070
               Homeless
cases        0.79161590
deaths       0.79642656
Male        -0.08015552
Female       0.15148878
Population   0.86317801
Daylockdown -0.35735597
GDPs         0.89961070
Homeless     1.00000000
ggcorrplot(cor(corr_dataUS),hc.order = TRUE,
           outline.color = "white",
           colors = c("#6D9EC1","white","#E46726"),
           lab = TRUE)

#View(df_us)
#merge
mergcountry = function(data1,data2){
  data <- merge(x = data1, y = data2, by = "state", all.x = TRUE) 
  return(data)
}

df.Allus <- mergcountry(df_us, df_gender_us)
df.Allus <- mergcountry(df.Allus, df_pop_us)
df.Allus <- mergcountry(df.Allus, df_lockdown_us)
df.Allus <- mergcountry(df.Allus, df_gdp_us)
df.Allus <- mergcountry(df.Allus, df_homeless_us)
#df_Allus <- mergcountry(top_us, df_Allus)
View(df_Allus)
index <- is.na(df.Allus)
df.Allus[index] <- 0

normalize = function(data){
  #return ((data - min(data,na.rm = TRUE))/(max(data,na.rm = TRUE) - min(data,na.rm = TRUE)))
  z <- scale(data);
  tanh(z/2)
}
Allus = as.data.frame(apply(df.Allus[,3:10],2,normalize))
corr_dataUS <- Allus 

View(Allus)
corr_dataUS <- rename(corr_dataUS,"Daylockdown"="Day lockdown")
#correlation
corr_dataUS %<>% select(c(cases,deaths,Male,Female,Population,Daylockdown,GDPs,Homeless))
head(corr_dataUS)
cor(corr_dataUS)
                cases    deaths      Male    Female Population Daylockdown      GDPs  Homeless
cases       1.0000000 0.8771162 0.1273080 0.1904992  0.6171785   0.2038334 0.6075674 0.5556301
deaths      0.8771162 1.0000000 0.1147696 0.1847547  0.6601563   0.2104893 0.6777292 0.6083036
Male        0.1273080 0.1147696 1.0000000 0.9513769  0.1956974   0.5064869 0.1828444 0.1390017
Female      0.1904992 0.1847547 0.9513769 1.0000000  0.2679789   0.5403218 0.2419723 0.1905770
Population  0.6171785 0.6601563 0.1956974 0.2679789  1.0000000   0.3394907 0.9855365 0.8879380
Daylockdown 0.2038334 0.2104893 0.5064869 0.5403218  0.3394907   1.0000000 0.3062257 0.2385315
GDPs        0.6075674 0.6777292 0.1828444 0.2419723  0.9855365   0.3062257 1.0000000 0.9114065
Homeless    0.5556301 0.6083036 0.1390017 0.1905770  0.8879380   0.2385315 0.9114065 1.0000000
ggcorrplot(cor(corr_dataUS),hc.order = TRUE,
           outline.color = "white",
           colors = c("#6D9EC1","white","#E46726"),
           lab = TRUE)

---
title: "R Notebook"
output: html_notebook
---
```{r}
#install.packages("knitr")
#install.packages("grid")
#install.packages("plotly")
#install.packages("dprep")
#install.packages("normalr")
#install.packages("ggcorrplot")

#install.packages("RColorBrewer")
#install.packages("rgdal")
#install.packages("jsonlite")
#install.packages("readr")
#install.packages("readr")
```

```{r}
library(gridExtra)
library(dplyr)
library(lubridate)
library(magrittr)
library(ggplot2)
library(tidyr)
library(knitr)
#library(normalr)
library(ggcorrplot)

library(leaflet)
library(plotly)
library(RColorBrewer)
library(readr)
#library(MLRMPA)
#??src_mysql
my_db <- src_mysql(
  dbname = "coronavirus",
  host = "localhost"
)
my_db

##import data
df_conf <- tbl(my_db, sql("select * from time_series_covid19_confirmed_global"))
df_conf <- as.data.frame(df_conf)
df_conf
df_deaths <- tbl(my_db, sql("select * from time_series_covid19_deaths_global"))
df_deaths <- as.data.frame(df_deaths)
df_deaths
df_recover <- tbl(my_db, sql("select * from time_series_covid19_recovered_global"))
df_recover <- as.data.frame(df_recover)
df_recover
```
```{r}
##check the time frame of the data
n.col <- ncol(df_conf)
dates <- names(df_conf)[5:n.col]%>% mdy()
range(dates)
min.date <- min(dates)
max.date <- max(dates)
min.date.txt <- min.date %>% format('%d %b %Y')
max.date.txt <- max.date %>% format('%d %b %Y')
```

```{r}
#clean data
cleanData <- function(data) {
  ## remove some columns
  data %<>% select(-c(Province.State, Lat, Long)) %>% rename(country=Country.Region)
  ## convert from wide to long format
  data %<>% gather(key=date, value=count, -country)
  ## convert from character to date
  data %<>% mutate(date = date %>% mdy())
  ## aggregate by country
  data %<>% group_by(country, date) %>% summarise(count=sum(count, na.rm=T)) %>% as.data.frame()
  return(data)
}
## clean the three data sets
data.confirmed <- df_conf %>% cleanData() %>% rename(confirmed=count)
data.deaths <- df_deaths %>% cleanData() %>% rename(deaths=count)
data.recovered <- df_recover %>% cleanData() %>% rename(recovered=count)
data <- data.confirmed %>% merge(data.deaths, all=T) %>% merge(data.recovered, all=T)
data
## countries/regions with confirmed cases, excl. cruise ships
countries <- data %>% pull(country) %>% setdiff('Cruise Ship')
data 
```


```{r}
data.world <- data %>% group_by(date) %>%
  summarise(country='World',
            confirmed = sum(confirmed, na.rm=T),
            deaths = sum(deaths, na.rm=T),
            recovered = sum(recovered, na.rm=T))
data %<>% rbind(data.world)
data
data %<>% mutate(current.confirmed = confirmed - deaths - recovered)
data

```

```{r}
#rate
data %<>% arrange(country, date)
n <- nrow(data)
day1 <- min(data$date)
data %<>% mutate(new.confirmed = ifelse(date == day1, NA, confirmed - lag(confirmed, n=1)),
                 new.deaths = ifelse(date == day1, NA, deaths - lag(deaths, n=1)),
                 new.recovered = ifelse(date == day1, NA, recovered - lag(recovered, n=1)))
data %<>% mutate(new.confirmed = ifelse(new.confirmed < 0, 0, new.confirmed),
                 new.deaths = ifelse(new.deaths < 0, 0, new.deaths),
                 new.recovered = ifelse(new.recovered < 0, 0, new.recovered))
## death rate based on total deaths and recovered cases
data %<>% mutate(rate.upper = (100 * deaths / (deaths + recovered)) %>% round(1))
## lower bound: death rate based on total confirmed cases
data %<>% mutate(rate.lower = (100 * deaths / confirmed) %>% round(1))
## death rate based on the number of death/recovered on every single day
data %<>% mutate(rate.daily = (100 * new.deaths / (new.deaths + new.recovered)) %>% round(1))
View(data)
```

```{r}
## convert from wide to long format
data.long <- data %>%
  select(c(country, date, confirmed, current.confirmed, recovered, deaths)) %>%
  gather(key=type, value=count, -c(country, date))
## set factor levels to show them in a desirable order
data.long %<>% mutate(type=recode_factor(type, confirmed='Total Confirmed',
                                         current.confirmed='Current Confirmed',
                                         recovered='Recovered',
                                         deaths='Deaths'))
View(data.long)
```

```{r}
##Number of case World
world <- filter(data.long,country == 'World')
plot1 <- world %>% filter(type != 'Total Confirmed') %>%
  ggplot(aes(x=date, y=count)) +
  geom_area(aes(fill=type), alpha=0.5) +
  labs(title=paste0('Numbers of Cases Worldwide - ', max.date.txt)) +
  scale_fill_manual(values=c('red', 'green', 'black')) +
  theme(legend.title=element_blank(), legend.position='bottom',
        plot.title = element_text(size=7),
        axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        legend.key.size=unit(0.2, 'cm'),
        legend.text=element_text(size=6),
        axis.text=element_text(size=7),
        axis.text.x=element_text(angle=45, hjust=1))
plot2 <- world %>%
  ggplot(aes(x=date, y=count)) +
  geom_line(aes(color=type)) +
  labs(title=paste0('Numbers of Cases Worldwide (log scale) - ', max.date.txt)) +
  scale_color_manual(values=c('purple', 'red', 'green', 'black')) +
  theme(legend.title=element_blank(), legend.position='bottom',
        plot.title = element_text(size=14),
        axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        legend.key.size=unit(0.2, 'cm'),
        legend.text=element_text(size=14),
        axis.text=element_text(size=14),
        axis.text.x=element_text(angle=45, hjust=1)) +
  scale_y_continuous(trans='log10')
## show two plots side by side
grid.arrange(plot1, plot2, ncol=2)
```

```{r}
## Current Confirmed Cases
data.world <- data %>% filter(country=='World')
n <- nrow(data.world)
plot1 <- ggplot(data.world, aes(x=date, y=current.confirmed)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='Current Confirmed Cases') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot2 <- ggplot(data.world, aes(x=date, y=new.confirmed)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='Daily New Confirmed Cases') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
## show two plots side by side
grid.arrange(plot1, plot2, ncol=2)
View(data.world)
```

```{r}
## a scatter plot with a smoothed line and vertical x-axis labels
plot1 <- ggplot(data.world, aes(x=date, y=deaths)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='Accumulative Deaths') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot2 <- ggplot(data.world, aes(x=date, y=recovered)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='Accumulative Recovered Cases') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot3 <- ggplot(data.world, aes(x=date, y=new.deaths)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='New Deaths') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot4 <- ggplot(data.world, aes(x=date, y=new.recovered)) +
  geom_point() + geom_smooth() +
  xlab('') + ylab('Count') + labs(title='New Recovered Cases') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
## show four plots together, with 2 plots in each row
grid.arrange(plot1, plot2, plot3, plot4, nrow=2)
```



```{r}
## convert from wide to long format, for drawing area plots
rates.long <- data %>%
  select(c(country, date, rate.upper, rate.lower, rate.daily)) %>%
  gather(key=type, value=count, -c(country, date))
# set factor levels to show them in a desirable order
rates.long %<>% mutate(type=recode_factor(type, rate.daily='Daily',
                             rate.upper='Upper bound'))
```

```{r}
## ranking by confirmed cases
data.latest.all <- data %>% filter(date == max(date)) %>%
  select(country, date,confirmed, new.confirmed, current.confirmed,
         recovered, deaths, new.deaths, death.rate=rate.lower) %>%
  mutate(ranking = dense_rank(desc(confirmed)))
#View(data.latest.all)
k <- 20
## top 20 countries: 21 incl. 'World'
top.countries <- data.latest.all %>% filter(ranking <= k + 1) %>%
  arrange(ranking) %>% pull(country) %>% as.character()
top.countries %>% setdiff('World') %>% print()

```

```{r}
data.latest <- data.latest.all %>% filter(!is.na(country)) %>%
  mutate(country=ifelse(ranking <= k + 1, as.character(country), 'Others')) %>%
  mutate(country=country %>% factor(levels=c(top.countries, 'Others')))
data.latest %<>% group_by(country) %>%
  summarise(confirmed=sum(confirmed), new.confirmed=sum(new.confirmed),
            current.confirmed=sum(current.confirmed),
            recovered=sum(recovered), deaths=sum(deaths), new.deaths=sum(new.deaths)) %>%
  mutate(death.rate=(100 * deaths/confirmed) %>% round(1)) 
data.latest
data.latest %<>% select(c(country, confirmed, deaths, death.rate,
                          new.confirmed, new.deaths, current.confirmed,recovered)) %>%
  mutate(recover.rate=(100 * recovered/confirmed) %>% round(1))
data.latest
df_pop <- tbl(my_db, sql("select * from population "))
df_pop <- as.data.frame(df_pop)
df_pop <- rename(df_pop,"country"="Country")
df_pop
data.latest <- merge(x = data.latest, y = df_pop, by = "country", all.x = TRUE) 
data.latest <- rename(data.latest,"population" = "Population (2020)")
data.latest
data.latest  %<>% select(c(country, confirmed, deaths, death.rate,
                          new.confirmed, new.deaths,
                          current.confirmed,recovered,recover.rate,population)) %>%
  mutate(confirm.rate=(100 *confirmed/population) %>% round(1))
data.latest

```
```{r}
data.latest %>% mutate(death.rate=death.rate %>% format(nsmall=1) %>% paste0('%'))


```

```{r}
## convert from wide to long format, for drawing area plots
data.latest.long <- data.latest %>% filter(country!='World') %>%
  gather(key=type, value=count, -country)
## set factor levels to show them with proper text and in a desirable order
data.latest.long %<>% mutate(type=recode_factor(type,
                                                confirmed='Total Confirmed',
                                                deaths='Total Deaths',
                                                death.rate='Death Rate (%)',
                                                new.confirmed='New Confirmed (compared with one day before)',
                                                new.deaths='New Deaths (compared with one day before)',
                                                current.confirmed='Current Confirmed',
                                                recover.rate = 'Recover Rate(%)',
                                                confirm.rate = 'Confirmed Rate(%)'))
#View(data.latest.long)
data.one.dem <- filter(data.latest.long,type=='Total Confirmed'
                       | type=='Total Deaths'
                       | type=='Current Confirmed')
data.two.dem <- filter(data.latest.long,type=='Confirmed Rate(%)'
                       #| type=='New Confirmed (compared with one day before)'
                       #| type=='New Deaths (compared with one day before)'
                       | type=='Death Rate (%)'
                       | type=='Recover Rate(%)')
data.two.dem
```

```{r}
## bar chart
data.one.dem %>% ggplot(aes(x=country, y=count, fill=country, group=country)) +
  geom_bar(stat='identity') +
  geom_text(aes(label=count, y=count), size=2, vjust=0) +
  xlab('') + ylab('') +
  labs(title=paste0('Top 20 Countries with Most Confirmed Cases - ', max.date.txt)) +
  scale_fill_discrete(name='Country', labels=aes(count)) +
  theme(legend.title=element_blank(),
        legend.position='none',
        plot.title=element_text(size=11),
        axis.text=element_text(size=7),
        axis.text.x=element_text(angle=45, hjust=1)) +
  facet_wrap(~type, ncol=1, scales='free_y')

```
```{r}
data.two.dem$facet <- factor(data.two.dem$type, levels = c('Confirmed Rate(%)', 'Recover Rate(%)','Death Rate (%)'))
data.two.dem %>% 
  ggplot(aes(x=country, y=count, fill=country, group=country)) +
  geom_bar(stat='identity') +
  geom_text(aes(label=count, y=count), size=2, vjust=0) +
  xlab('') + ylab('') +
  labs(title=paste0('Top 20 Countries with Most Confirmed Cases - ', max.date.txt)) +
  scale_fill_discrete(name='Country', labels=aes(count)) +
  theme(legend.title=element_blank(),
        legend.position='none',
        plot.title=element_text(size=11),
        axis.text=element_text(size=9),
        axis.text.x=element_text(size=6,angle=45, hjust=1)) +
  facet_wrap(~facet, ncol=1, scales='free_y')
```

```{r}
#df_gdp
df_gdp2019 <- tbl(my_db, sql("select * from gdp19"))
df_gdp2019 <- as.data.frame(df_gdp2019)
df_gdp2019

```
```{r}
#healthranking
df_healt <- tbl(my_db, sql("select * from healthranking"))
df_healt <- as.data.frame(df_healt)
df_healt <- select(df_healt,c("country","healthCareIndex"))
df_healt
```

```{r}
#Top20Pornhub
df_pornhub <- tbl(my_db, sql("select * from Pornhub"))
df_pornhub <- as.data.frame(df_pornhub)
df_pornhub

```


```{r}
#temp
df_temp <- tbl(my_db, sql("select * from Avg_World_Temp_2020"))
df_temp <- as.data.frame(df_temp)
df_city <- select(df_temp,c("Country","City")) %>%
  rename(country=Country) %>% 
  rename(city=City)
numofcity <- aggregate(city ~ country, data = df_city, length)
df_temp <- select(df_temp,c("Country","Apr","May","Jun","Jul","Aug")) %>%
  rename(country=Country)
df_temp <- data.frame(country=df_temp[,1],avg=rowMeans(df_temp[,-1]))
df_temp <- df_temp %<>% group_by(country) %>% summarise(avg_temp = mean(avg,na.rm = TRUE))
df_temp <- df_temp %>% mutate(country=ifelse(country=="United States","US", country ) ) 
df_temp$avg_temp <- df_temp$avg_temp %>% 
  sprintf(df_temp$avg_temp, fmt = '%#.1f') %>%
  as.numeric(df_temp$avg_temp)
df_temp
```



```{r}
#Top 20 with gdp
data.longGDP <- df_gdp %>% gather(key=year, value=GDP, -c(country))
data.top <- data.latest %>% filter(country!='World')
data.top <- head(data.top,20)
View(data.latest)
#merge
mergcountry = function(data1,data2){
  data <- merge(x = data1, y = data2, by = "country", all.x = TRUE) 
  return(data)
}
data.top.world <- merge(x = data.top, y = df_gdp2019, by = "country", all.x = TRUE) %>% 
  select(-c(code,rank,new.confirmed,new.deaths,current.confirmed,population)) %>% 
  rename(GDP="GDP (millions of US dollars)")

data.top.world <- merge(x = data.top.world, y = df_healt, by = "country", all.x = TRUE) %>%
  rename(healthcare="healthCareIndex")
#data.top.world <- mergcountry(data.top.world, df_temp)

data.top.world <- merge(x = data.top.world, y = df_pornhub, by = "country", all.x = TRUE) %>%
  rename(Pornhub = "PornhubIndex(%)")

data.top.world <- mergcountry(data.top.world, df_temp)
index <- is.na(data.top.world)
data.top.world[index] <- 0
data.top.world
View(data.top.world)

normalize = function(data){
  #return ((data - min(data,na.rm = TRUE))/(max(data,na.rm = TRUE) - min(data,na.rm = TRUE)))
  z <- scale(data);
  tanh(z/2)
}
norm_data = as.data.frame(apply(data.top.world[,2:12],2,normalize))
corr_data <- norm_data
norm_data$country <- c("Argentina","Bangladesh","Brazil","Chile","Colombia","France","Germany","India","Iran","Italy","Mexico","Pakistan","Peru","Russia","saudi Arabia","South Africa","Spain","Turkey","United Kingdom","US")
#View(norm_data)


norm_data_plot <- select(norm_data,"country","confirm.rate","death.rate","recover.rate","healthcare","Pornhub","GDP","avg_temp")
norm_data_plot %<>% gather(key=type, value=count, -c(country))
level_order <- factor(norm_data_plot$type, 
                      level = c("GDP","avg_temp","healthcare","recover.rate","death.rate","confirm.rate","Pornhub"))
ggplot(data = norm_data_plot, aes(x=country, y=level_order, fill=count)) + 
  geom_tile() +
  scale_fill_gradient(low = "pink", high = "blue") +
  xlab("") +
  ylab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90,vjust = 1))+
  theme(
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    #legend.position = "none"
  )

  
```






```{r}
#correlation
corr_data %<>% select(c(GDP,confirm.rate,death.rate,recover.rate,healthcare,avg_temp,Pornhub))
head(corr_data)
cor(corr_data)
ggcorrplot(cor(corr_data),hc.order = TRUE,
           outline.color = "white",
           colors = c("#6D9EC1","white","#E46726"),
           lab = TRUE)
```
```{r}
df <- data.long %>% filter(country %in% top.countries) %<>%
  mutate(country=country %>% factor(levels=c(top.countries)))
df %>% filter(country != 'World' & type != 'Total Confirmed') %>%
  ggplot(aes(x=date, y=count, fill=type)) +
  geom_area(alpha=0.5) +
# xlab('') + ylab('') +
  labs(title=paste0('Numbers of COVID-19 Cases in Top 20 Countries - ',
                    max.date.txt)) +
  scale_fill_manual(values=c('red', 'green', 'black')) +
  theme(legend.title=element_blank(), legend.position='bottom',
        plot.title = element_text(size=12),
        axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        legend.key.size=unit(0.4, 'cm'),
        legend.text=element_text(size=12),
        strip.text.x=element_text(size=12),
        axis.text=element_text(size=12),
        axis.text.x=element_text(angle=45, hjust=1)) +
  facet_wrap(~country, ncol=4, scales='free_y')

```
```{r}
p <- df %>% filter(country != 'World') %>%
  ggplot(aes(x=date, y=count, color=type)) +
  geom_line() +
  labs(title=paste0('Numbers of COVID-19 Cases in Top 20 Countries (log scale) - ',
                    max.date.txt)) +
  scale_color_manual(values=c('purple', 'red', 'green', 'black')) +
  theme(legend.title=element_blank(), legend.position='bottom',
        plot.title = element_text(size=10),
        axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        legend.key.size=unit(0.4, 'cm'),
        legend.text=element_text(size=10),
        strip.text.x=element_text(size=10),
        axis.text=element_text(size=10),
        axis.text.x=element_text(angle=45, hjust=1)) +
  scale_y_continuous(trans='log10')
p + facet_wrap(~country, ncol=4, scales='free_y')
```
```{r}
data.world %<>% arrange(desc(date)) %>%
  select(c(date, confirmed, deaths, recovered, current.confirmed,new.confirmed, new.deaths, new.recovered, rate.lower, rate.upper, rate.daily))
data.world %>%
  mutate(rate.upper = rate.upper %>% format(nsmall=1) %>% paste0('\\%'),
         rate.lower = rate.lower %>% format(nsmall=1) %>% paste0('\\%'),
         rate.daily = rate.daily %>% format(nsmall=1) %>% paste0('\\%')) 
```
```{r}
#sars_2003
df_sars <- tbl(my_db, sql("select * from sars_2003_update"))
df_sars <- as.data.frame(df_sars)
df_sars
```


```{r}
## convert from character to date
#datesSar <- as.Date(df_sars$Date,format = "%m/%d/%y")

df_sars %<>%  mutate(Date = as.Date(df_sars$Date,format = "%m/%d/%y"))
df_sars
```

```{r}
## convert from wide to long format
dataSar.long <- df_sars %>%
  select(c(Date, country, Cumulative_number , Number_deaths, Number_recovered)) %>%
  gather(key=type, value=count, -c(country, Date))
## set factor levels to show them in a desirable order
dataSar.long %<>% mutate(type=recode_factor(type, Cumulative_number ='Cumulative Number',
                                         Number_deaths ='Number of deaths',
                                         Number_recovered ='Number of recovered'))
View(dataSar.long)
 
```






```{r}
#Covid_Thailand
df_thai <- tbl(my_db, sql("select * from covid19_Thailand"))
df_thai <- as.data.frame(df_thai)
View(df_thai)
```

```{r}
#clean Covid_Thailand
dates.th <- df_thai[,2]%>% mdy()
range(dates.th)
min.date.th <- min(dates.th)
max.date.th <- max(dates.th)
min.date.txt.th <- min.date.th %>% format('%d %b %Y')
max.date.txt.th <- max.date.th %>% format('%d %b %Y')
```
```{r}
df_thai$announce_date <- mdy(df_thai$announce_date)
df_thai$notification_date <- mdy(df_thai$notification_date)
df_thai 
```
```{r}
df_thai <- df_thai %>% select(!No.) %>% select(!notification_date) %>% 
  group_by(announce_date)
df_thai
```
```{r}
# Total confirmed cases in Thailand
data.thai.count <- df_thai %>%
  select(announce_date) %>%
  summarise(comfirmed = n())  %>% as.data.frame()
data.thai.count$cumulative_confirmed <- cumsum(data.thai.count[, 2])
data.thai.count
```


```{r}
## Thai Confirmed Cases (Jan 2020 - Jan 2021
plot1 <- ggplot(data.thai.count, aes(x=announce_date, y=cumulative_confirmed)) +
  geom_point() + geom_smooth() +
  xlab(" ") + ylab("Count") + labs(title='Thai Cumulative Confirmed Cases (Jan 2020 - Jan 2021)') +
  theme(axis.text.x=element_text(angle=45, hjust=1))
plot2 <- ggplot(data.thai.count, aes(x=announce_date, y=comfirmed)) +
  geom_point() + geom_smooth() +
    xlab(" ") + ylab("Count")+ labs(title='Thai Confirmed Cases (Jan 2020 - Jan 2021 log scale)') +
   theme(axis.text.x=element_text(angle=45, hjust=1))+scale_y_continuous(trans='log10')
## show two plots side by side
grid.arrange(plot1, plot2, ncol=1)
```
```{r}
## Thai Confirmed Cases (Jan 2020 - Jan 2021) log scale
plot1 <- ggplot(data.thai.count, aes(x=announce_date, y=cumulative_confirmed)) +
  geom_point() + geom_smooth() +
  xlab(" ") + ylab("Count") + labs(title='Thai Cumulative Confirmed Cases (Jan 2020 - Jan 2021 log scale)') +
  theme(axis.text.x=element_text(angle=45, hjust=1))+scale_y_continuous(trans='log10')
plot2 <- ggplot(data.thai.count, aes(x=announce_date, y=comfirmed)) +
  geom_point() + geom_smooth() +
  xlab(" ") + ylab("Count")+ labs(title='Thai Confirmed Cases (Jan 2020 - Jan 2021 log scale)') +
  theme(axis.text.x=element_text(angle=45, hjust=1))+scale_y_continuous(trans='log10')
## show two plots side by side
grid.arrange(plot1, plot2, ncol=1)
```

```{r}
# Confirmed cases divided by sex (gender)
df_thai$sex[df_thai$sex == ""] <- "Unknown"
data.thai.gender <- df_thai %>%
  group_by(sex) %>%
  summarise(count = n()) %>%
  mutate(percent = (count / sum(count) * 100) %>% round(2)) %>%
  filter(percent>1)%>%
  #mutate(pos = cumsum(percent) - 0.5*percent) %>%
  arrange(desc(percent))

data.thai.gender
```

```{r}
data.thai.gender$sex <- factor(data.thai.gender$sex, levels = as.character(data.thai.gender$sex))
data.thai.gender$sex
g.th.gender <- data.thai.gender %>% 
  ggplot(aes(x = "", y = percent, fill = sex)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  theme_void() +
  labs(title='Gender of Thai Confirmed Cases (Jan 2020 - Jan 2021)')+
  geom_text(aes(label = paste0(percent, "%")), color = "white", size = 5, position = position_stack(vjust = 0.5)) +
  guides(fill = guide_legend(reverse = TRUE)) 
g.th.gender
```

```{r}
# Confirmed cases divided by province_of_onset
data.thai.onset <- df_thai %>%
    group_by(province_of_onset) %>%
    summarise(count = n()) %>%
    arrange(desc(count))%>%
    rename(onset="count")%>%
    rename(province="province_of_onset")
data.thai.onset$province[data.thai.onset$province == ""] <- "Unknown"
data.thai.onset
```
```{r}
# Confirmed cases divided by province_of_isolation 
data.thai.isolation <- df_thai %>%
    group_by(province_of_isolation) %>%
    summarise(count = n()) %>%
    arrange(desc(count))%>%
    rename(isolation ="count")%>%
    rename(province="province_of_isolation")

data.thai.isolation$province[data.thai.isolation$province == ""] <- "Unknown"
data.thai.isolation
```
```{r}
data.province <- merge(x = data.thai.onset, y = data.thai.isolation, by = "province", all.x = TRUE) 
data.province <- data.province%>%filter(onset>125)
data.province
```
```{r}
data.province.long <- data.province %>%
    gather(key=type, value=count, -c(province))
data.province.long
```

```{r}
ggplot(data.province.long,aes(x=province,y=count))+
    geom_bar(stat = "identity",position = "dodge",aes(fill=type))+
    labs(title = "Province of onset and Province of isolation in Thailand")+
    theme(legend.title=element_blank(),
                  #legend.position='none',
                  plot.title=element_text(size=12),
                  axis.text=element_text(size=10),
                  axis.text.x=element_text(angle=45, hjust=1))+ xlab('') + ylab('Count')
```






```{r}
# Confirmed cases divided by risk
data.thai.risk <- df_thai %>%
  group_by(risk) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
data.thai.risk
```

```{r}
# Confirmed cases divided by risk
data.thai.risk <- df_thai %>%
  group_by(risk) %>%
  summarise(count = n()) %>%
  mutate(percent = (count / sum(count) * 100) %>% round(2)) %>%
  filter(percent>0.76) %>%
  arrange(desc(percent)) 
data.thai.risk
```
```{r}
data.thai.risk$risk[data.thai.risk$risk == "C"] <- "Close contact with the patient"
data.thai.risk$risk[data.thai.risk$risk == "CGustDr Samut SaGGAF"] <- "Cluster Samut Sakhon"
data.thai.risk$risk[data.thai.risk$risk == "F"] <- "State Quarantine"
data.thai.risk$risk[data.thai.risk$risk == "G"] <- "Go to a crowded place"
data.thai.risk$risk[data.thai.risk$risk == "A"] <- "People travelling from abroad"
data.thai.risk$risk[data.thai.risk$risk == "UFGFAwF"] <- "Unknown"
data.thai.risk$risk[data.thai.risk$risk == "D"] <- "Career at risk"
data.thai.risk$risk[data.thai.risk$risk == "CGustDr RayAFg"] <- "Cluster Rayong"
data.thai.risk$risk[data.thai.risk$risk == "CGustDr Pattaya CasCFA"] <- "Cluster Pattaya Casino"
data.thai.risk$risk[data.thai.risk$risk == "CGustDr AFg TGAFg"] <- "Cluster Ang Thong"
data.thai.risk
```




```{r}
data.thai.risk$risk <- factor(data.thai.risk$risk, levels = as.character(data.thai.risk$risk))
data.thai.risk$risk
g.th.risk <- data.thai.risk %>% 
  ggplot(aes(x = "", y = percent, fill = risk)) +
  geom_bar(stat = "identity", width = 0.5) +
  coord_polar("y") +
  theme_void() +
  labs(title='Risk of Thai Confirmed Cases(Jan 2020 - Jan 2021)')+
  geom_text(aes(label = paste0(percent, "%")), color = "Black", size = 3, position = position_stack(vjust = 0.5)) +
  guides(fill = guide_legend(reverse = TRUE)) 
g.th.risk
```



```{r}
# Confirmed cases divided by age
data.thai.age <- df_thai %>%
  group_by(age,sex) %>% 
  filter(age != 0)%>%
  summarise(count = n()) %>%
  arrange(desc(count))
data.thai.age
```
```{r}
ggplot(data.thai.age,aes(x=age,y=count,fill=sex))+geom_bar(stat = "identity")+
      labs(title='Age of Thai Confirmed Cases (Stack)')+guides(fill=guide_legend(reverse = T))

```

```{r}
# Confirmed cases divided by nationality


data.thai.nationality <- df_thai %>%
  group_by(nationality) %>%
  summarise(count = n()) %>%
  filter(count > 11)%>%
  arrange(desc(count))
data.thai.nationality$nationality[data.thai.nationality$nationality == "????????"] <- "Unknown"
data.thai.nationality$nationality[data.thai.nationality$nationality == ""] <- "Unknown"
data.thai.nationality$nationality[data.thai.nationality$nationality == "Burma"] <- "Myanmar"
data.thai.nationality
```
```{r}
ggplot(data.thai.nationality)+ 
  geom_bar(aes(x=nationality,y=count,fill = count),stat = "identity",show.legend = F)+
    ggtitle('Nationality of Thai Confirmed Cases (log scale)')+
    theme(plot.title = element_text(hjust = 0.5))+
    theme(axis.text = element_text(size = 10),
            axis.title = element_text(size = 10),
            plot.title = element_text(size = 10))+
    xlab("")+ylab("Number of Nationality")+
   scale_y_continuous(trans='log10')+coord_flip()+scale_fill_gradient(low="blue",high = "red")
```



```{r}
#Covid_US
df_us <- tbl(my_db, sql("select * from covidUs"))
df_us <- as.data.frame(df_us)
df_us
```

```{r}
#clean Covid_US
dates.us <- df_us[,2]%>% mdy()
range(dates.us)
min.date.us <- min(dates.us)
max.date.us <- max(dates.us)
min.date.txt.us <- min.date.us %>% format('%d %b %Y')
max.date.txt.us <- max.date.us %>% format('%d %b %Y')
```

```{r}
df_us$date <- mdy(df_us$date)
df_us
```


```{r}
df_us <- df_us %>% select(!MyUnknownColumn) %>% select(!fips) %>% 
  group_by(date)
df_us
```


```{r}
data.us.total <- df_us %>% group_by(date) %>%
  summarise(state ='US',
            cases = sum(deaths,na.rm=T),
            deaths = sum(deaths,na.rm=T))
View(data.us.total)
```


```{r}
data.us.latest <- df_us %>%filter(date==max.date.us)
View(data.us.latest)
```

```{r}
data.us.latest %<>% mutate(ranking = dense_rank(desc(cases))) %>%arrange(ranking) 
top.us <- data.us.latest[,2]
top.us
```
```{r}
# List of top 20 state
k <- 19
data.us.top <- data.us.latest %>%
    filter(ranking <= k+1) %>% 
    arrange(ranking) 
View(data.us.top)
us.state.top <- data.us.top %>% pull(state) %>% as.character()
us.state.top  %>% setdiff('US') %>% print()
View(us.state.top)
```
```{r}
# confirmed rate & death rate of top 20 state
top_us <- data.us.latest %>% filter(state %in% us.state.top & state != "US")
View(top_us)
```






```{r}
#gender in us
df_gender_us <- tbl(my_db, sql("select * from data_gender"))
df_gender_us <- as.data.frame(df_gender_us)
df_gender_us <- select(df_gender_us,c("State","Male","Female"))
df_gender_us <- rename(df_gender_us,"state"="State")
df_gender_us
```
```{r}
#population in us
df_pop_us <- tbl(my_db, sql("select * from data_population"))
df_pop_us <- as.data.frame(df_pop_us)
df_pop_us <- select(df_pop_us,c("State","Population"))
df_pop_us <- rename(df_pop_us,"state"="State")
df_pop_us
```
```{r}
#lockdown in us
df_lockdown_us <- tbl(my_db, sql("select * from data_lockdown"))
df_lockdown_us <- as.data.frame(df_lockdown_us)
df_lockdown_us <- select(df_lockdown_us,c("State","Day lockdown"))
df_lockdown_us <- rename(df_lockdown_us,"state"="State")
df_lockdown_us
```

```{r}
#GDP in us
df_gdp_us <- tbl(my_db, sql("select * from gdp_us"))
df_gdp_us <- as.data.frame(df_gdp_us)
df_gdp_us <- select(df_gdp_us,c("State","GDPs"))
df_gdp_us <- rename(df_gdp_us,"state"="State")
df_gdp_us
```

```{r}
#homeless in us
df_homeless_us <- tbl(my_db, sql("select * from us_homeless"))
df_homeless_us <- as.data.frame(df_homeless_us)
df_homeless_us <- select(df_homeless_us,c("State","Homeless"))
df_homeless_us <- rename(df_homeless_us,"state"="State")
df_homeless_us
```


```{r}
#View(df_us)
#merge
mergcountry = function(data1,data2){
  data <- merge(x = data1, y = data2, by = "state", all.x = TRUE) 
  return(data)
}

df_Allus <- mergcountry(top_us, df_gender_us)
df_Allus <- mergcountry(df_Allus, df_pop_us)
df_Allus <- mergcountry(df_Allus, df_lockdown_us)
df_Allus <- mergcountry(df_Allus, df_gdp_us)
df_Allus <- mergcountry(df_Allus, df_homeless_us)
#df_Allus <- mergcountry(top_us, df_Allus)
View(df_Allus)

```




```{r}
index <- is.na(df_Allus)
df_Allus[index] <- 0

normalize = function(data){
  #return ((data - min(data,na.rm = TRUE))/(max(data,na.rm = TRUE) - min(data,na.rm = TRUE)))
  z <- scale(data);
  tanh(z/2)
}
Allus = as.data.frame(apply(df_Allus[,3:11],2,normalize))
corr_dataUS <- Allus 

View(Allus)

```

```{r}
Allus$state <- c(df_Allus$state)
Allus <- rename(Allus,"confirmed"="cases")
View(Allus)
```


```{r}


Allus_plot <- select(Allus,"state","confirmed","deaths","Male","Female","Population","Day lockdown","GDPs","Homeless")
Allus_plot %<>% gather(key=type, value=count, -c(state))
level_order <- factor(Allus_plot$type, 
                      level = c("confirmed","deaths","Male","Female","Population","Day lockdown","GDPs","Homeless"))
ggplot(data = Allus_plot, aes(x=state, y=level_order, fill=count)) + 
  geom_tile() +
  scale_fill_gradient(low = "pink", high = "blue") +
  xlab("") +
  ylab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90,vjust = 1))+
  theme(
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    #legend.position = "none"
  ) +labs(title='The heatmap of COVID-19 infections in US')



```

```{r}
corr_dataUS <- rename(corr_dataUS,"Daylockdown"="Day lockdown")
```

```{r}
#correlation
corr_dataUS %<>% select(c(cases,deaths,Male,Female,Population,Daylockdown,GDPs,Homeless))
head(corr_dataUS)
cor(corr_dataUS)
ggcorrplot(cor(corr_dataUS),hc.order = TRUE,
           outline.color = "white",
           colors = c("#6D9EC1","white","#E46726"),
           lab = TRUE)
```



```{r}
#View(df_us)
#merge
mergcountry = function(data1,data2){
  data <- merge(x = data1, y = data2, by = "state", all.x = TRUE) 
  return(data)
}

df.Allus <- mergcountry(df_us, df_gender_us)
df.Allus <- mergcountry(df.Allus, df_pop_us)
df.Allus <- mergcountry(df.Allus, df_lockdown_us)
df.Allus <- mergcountry(df.Allus, df_gdp_us)
df.Allus <- mergcountry(df.Allus, df_homeless_us)
#df_Allus <- mergcountry(top_us, df_Allus)
View(df_Allus)

```

```{r}
index <- is.na(df.Allus)
df.Allus[index] <- 0

normalize = function(data){
  #return ((data - min(data,na.rm = TRUE))/(max(data,na.rm = TRUE) - min(data,na.rm = TRUE)))
  z <- scale(data);
  tanh(z/2)
}
Allus = as.data.frame(apply(df.Allus[,3:10],2,normalize))
corr_dataUS <- Allus 

View(Allus)
```




```{r}
corr_dataUS <- rename(corr_dataUS,"Daylockdown"="Day lockdown")
```

```{r}
#correlation
corr_dataUS %<>% select(c(cases,deaths,Male,Female,Population,Daylockdown,GDPs,Homeless))
head(corr_dataUS)
cor(corr_dataUS)
ggcorrplot(cor(corr_dataUS),hc.order = TRUE,
           outline.color = "white",
           colors = c("#6D9EC1","white","#E46726"),
           lab = TRUE)
```
























